Build ENCODE RNA-Seq pipeline indices

1 Introduction

For more details, refer to ENCODE RNA-Seq pipeline.

2 Prepare necessary files

2.1 Convert GFF to GTF

gffread /data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gff3 -T -o /data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf

# or
singularity shell -B "${PWD}":/data /data/softwares/agat/agat_1.5.0--pl5321hdfd78af_0.sif
agat_convert_sp_gff2gtf.pl --gtf_version 3 --gff Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gff3 -o Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gtf
# sort GTF
# copy /usr/local/lib/perl5/site_perl/auto/share/dist/AGAT/agat_config.yaml to your working directory
# set 'output_format: GTF' and 'gtf_output_version: 3'
agat_convert_sp_gxf2gxf.pl -g Neovision_vision.v1.v2.KIZ_NCBI.basic.chr.annotation.gtf -o Neovision_vision.v1.v2.KIZ_NCBI.basic.chr.annotation.sorted.gtf -c agat_config.yaml

2.2 Merge annotations if needed

2.2.1 Prepare input JSON file

{
    "merge_anno.annotation": "gencode.v29.primary_assembly.annotation_UCSC_names.gtf.gz",
    "merge_anno.tRNA": "gencode.v29.tRNAs.gtf.gz",
    "merge_anno.spikeins": "ERCC_phiX.fa.gz",
    "merge_anno.output_filename": "merged_annotation_V29.gtf.gz"
}

2.2.2 Merge annotations

caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/merge_anno.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i merge_anno_input.json --max-concurrent-tasks 2 --singularity

3 Build STAR index

3.1 Prepare input JSON file

{
    "build_index.reference_sequence": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/sugarglider.fasta.gz",
    "build_index.spikeins": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/ERCC_phix.spikeins.fasta.gz",
    "build_index.annotation": "/data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf.gz",
    "build_index.anno_version": "v1",
    "build_index.genome": "sugarglider",
    "build_index.index_type": "prep_star",
    "build_index.ncpu": 60,
    "build_index.memGB": 512
}

3.2 Build STAR index

caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/build_genome_index.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i /data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_rna_seq/star_index_input.json --max-concurrent-tasks 1 --singularity

4 Build RSEM index

4.1 Prepare input JSON file

{
    "build_index.reference_sequence": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/sugarglider.fasta.gz",
    "build_index.spikeins": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/ERCC_phix.spikeins.fasta.gz",
    "build_index.annotation": "/data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf.gz",
    "build_index.anno_version": "v1",
    "build_index.genome": "sugarglider",
    "build_index.index_type": "prep_rsem",
    "build_index.ncpu": 60,
    "build_index.memGB": 512
}

4.2 Build RSEM index

caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/build_genome_index.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i /data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_rna_seq/rsem_index_input.json --max-concurrent-tasks 1 --singularity

5 Prepare other files

In most cases, you need to modify the following code to suit yourself.

5.1 Prepare transcript IDs to gene biotypes mapping table

library(rtracklayer)
library(tidyverse)
library(vroom)

# GTF only
# mandatory field: transcript_id
gtf_file <- "/data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v1.NCBI_Pipeline.basic.chr.annotation.gtf.gz"
type_field <- "type"
transcript_type_values <- c("transcript")
gene_type_values <- c("gene")
# with higher precedence than transcript biotype field
gene_biotype_field <- "gene_biotype"
# all transcripts lacking gene biotype field
# or transcript biotype field will be assigned this value
gene_biotype_default <- "protein_coding"
# if there is no gene biotype field,
# and transcript biotype field exists,
# the transcript biotype field will be used
transcript_biotype_field <- "transcript_biotype"

df <- as.data.frame(rtracklayer::import(gtf_file, format = "gtf"))

if (any(gene_type_values %in% unique(df[[type_field]])) && gene_biotype_field %in% names(df)) {
    message("gene biotypes will be used")
    gene_df <- filter(df, .data[[type_field]] %in% gene_type_values) %>%
        select(all_of(c("gene_id", gene_biotype_field))) %>%
        na.omit() %>%
        distinct()

    transcript_df <- filter(df, .data[[type_field]] %in% transcript_type_values) %>%
        select(all_of(c("transcript_id", "gene_id"))) %>%
        na.omit() %>%
        distinct()
    transcript_df <- left_join(transcript_df, gene_df, by = "gene_id") %>%
        select(-gene_id) %>%
        distinct()
    transcript_df$gene_biotype[is.na(transcript_df$gene_biotype)] <- gene_biotype_default
} else if (transcript_biotype_field %in% names(df)) {
    message("transcript biotypes will be used")
    transcript_df <- filter(df, .data[[type_field]] %in% transcript_type_values) %>%
        select(all_of(c("transcript_id", transcript_biotype_field))) %>%
        na.omit() %>%
        distinct()
} else {
    message("the default gene biotype will be used for all transcripts")
    transcript_df <- filter(df, .data[[type_field]] %in% transcript_type_values) %>%
        select(transcript_id) %>%
        mutate(gene_biotype = gene_biotype_default) %>%
        na.omit() %>%
        distinct()
}

if (nrow(transcript_df) == 0) {
    stop("no transcripts found after filtering")
} else {
    vroom_write(
        transcript_df,
        file = gsub("(?i)(gff(3)?|gtf)(\\.gz)?$",
            "transcript_id_to_gene_type.tsv",
            gtf_file,
            perl = TRUE
        ),
        delim = "\t",
        col_names = FALSE,
        append = FALSE
    )
}

5.2 Prepare gene/transcript IDs to names mapping table

library(YRUtils)
library(vroom)

gff_file <- "/data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v1.NCBI_Pipeline.basic.chr.annotation.gtf.gz"
target_type <- "transcript"

df <- extract_gene_id_from_gff(gff_file, target_type = target_type)
if (nrow(df) == 0) {
    stop("no valid items found")
} else {
    vroom_write(
        df,
        file = gsub("(?i)(gff(3)?|gtf)(\\.gz)?$",
            paste0(target_type, "_id_name_mapping_table.tsv"),
            gff_file,
            perl = TRUE
        ),
        col_names = TRUE, append = FALSE
    )
}